1 Relationships in R

We employ different plots to demonstrate and explore the relationships.

Plots we use to visualize the relationship

2 Two variables

  • scatter plot
# Load the airquality data set and print a compact summary of its columns.
data("airquality")
str(object = airquality)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

Plot the relationship between Wind and Temp (temperature).

# Scatter plot of temperature against wind speed.
ggplot(
  data = airquality,                   # the data
  mapping = aes(x = Wind, y = Temp)    # the aes layer
) +
  geom_point()                         # the geom layer

  • Your TURN:

Try to plot the relationship between Solar.R and Temp

  • line graph
# Line graph of the same two variables; the mapping is supplied on the layer.
ggplot(data = airquality) +
  geom_line(mapping = aes(x = Wind, y = Temp))

  • In addition, we can always map other variables to aesthetics such as
    • color
    • size
    • fill
    • alpha, which then demonstrates the relationships among more than two variables.
  • Use the GGally package to visualize multiple pairwise correlations.
# GGally supplies ggpairs(), which draws a plot matrix for every pair of columns.
library(GGally)

# All columns are used here purely for demonstration; select a subset of
# variables first if you want a less crowded matrix.
ggpairs(data = airquality)

3 More than two variables

Use a bubble plot to map a third quantitative variable to the size (area) of the points.

  • bubble plot
# Bubble plot: Solar.R drives the point size on top of the Wind/Temp scatter.
ggplot(
  data = airquality,
  mapping = aes(x = Wind, y = Temp, size = Solar.R)
) +
  geom_point()

You can polish the plot by customizing the shape, color, fill, etc.

# airquality %>% 
#   ggplot(aes(Wind, Temp, size = Solar.R)) + 
#   geom_point(shape = 21, fill = "red") +
#   scale_size_continuous(range = c(1,10))
  • bubble plots with labels

We will use another more complicated data set to demonstrate.

# Load the mtcars data set and print a compact summary of its columns.
data("mtcars")
str(object = mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...

Please refer to the webpage (or run ?mtcars) for a detailed explanation of each variable.

library(ggrepel)

# Bubble plot of fuel economy (mpg) against weight (wt). Displacement (disp)
# is mapped to both the bubble size and the fill colour, and every bubble is
# labelled with its displacement value.
ggplot(data = mtcars, aes(x = wt, y = mpg)) +
  geom_point(aes(size = disp, fill = disp),
             shape = 21, colour = "black", alpha = 0.8) +
  # Diverging fill centred on the mean displacement.
  scale_fill_gradient2(low = "#377EB8", high = "#E41A1C",
                       midpoint = mean(mtcars$disp)) +
  # Map the label inside aes() so it is read from the plot data instead of
  # an external vector that merely happens to have the same row order.
  geom_text_repel(aes(label = disp)) +
  scale_size_area(max_size = 12) +
  # Pass the title as a named argument. The former `guide_legend((title = ...))`
  # evaluated an assignment and leaked a `title` variable into the workspace.
  # Both guides share a title and type so ggplot2 can merge them when their
  # breaks agree.
  guides(size = guide_legend(title = "Value"),
         fill = guide_legend(title = "Value")) +
  theme(
    legend.text = element_text(size = 10, face = "plain", color = "black"),
    axis.title = element_text(size = 10, face = "plain", color = "black"),
    axis.text = element_text(size = 10, face = "plain", color = "black"),
    legend.position = "right"
  )

  • different shapes of bubbles.
# Same bubble plot drawn with filled squares (shape 22) instead of circles.
ggplot(mtcars, aes(x = wt, y = mpg)) +
  geom_point(aes(size = disp, fill = disp),
             shape = 22, colour = "black", alpha = 0.8) +
  # brewer.pal() is namespace-qualified so this chunk does not rely on
  # RColorBrewer having been attached before it runs.
  scale_fill_gradient2(low = RColorBrewer::brewer.pal(7, "Set1")[2],
                       high = RColorBrewer::brewer.pal(7, "Set1")[1],
                       midpoint = mean(mtcars$disp)) +
  scale_size_area(max_size = 12) +
  # Named `title` argument; the former `guide_legend((title = ...))` evaluated
  # an assignment and leaked a `title` variable into the workspace.
  guides(fill = guide_legend(title = "Value"),
         size = guide_legend(title = "Value")) +
  theme(
    text = element_text(size = 15, color = "black"),
    # NOTE(review): "myfont" must be a font family registered elsewhere;
    # this element only matters once a plot title is added.
    plot.title = element_text(size = 15, family = "myfont",
                              face = "bold.italic", color = "black")
    # optional: legend.position = c(0.9, 0.05)
  )

4 3D surface plots

# Attach plotly for interactive graphics, then load and inspect the volcano
# elevation matrix.
library(plotly)
data(volcano)
str(object = volcano)
##  num [1:87, 1:61] 100 101 102 103 104 105 105 106 107 108 ...

The volcano dataset contains topographic information on Auckland’s Maunga Whau volcano: it gives the height of the volcano on a 10 m by 10 m grid.

Because the matrix gives the z values (heights) of the volcano, it is easy to draw a 3D surface.

# Create the plotly object from the height matrix and add a surface trace in
# one nested call, then print the figure.
fig <- add_surface(plot_ly(z = volcano))
fig

or just one line:

# One-line variant: request the surface trace type directly in plot_ly().
plot_ly(type = "surface", z = ~volcano)

5 Multiple variables

  • Chord diagram
library(circlize)
library(RColorBrewer)

# Reproducible 3 x 6 matrix of link weights: rows S1-S3, columns E1-E6.
set.seed(999)
mat <- matrix(
  sample(18, 18),
  nrow = 3,
  ncol = 6,
  dimnames = list(paste0("S", seq_len(3)), paste0("E", seq_len(6)))
)

# The same links in long (from, to, value) form, in column-major order.
df <- data.frame(
  from = rep(rownames(mat), times = ncol(mat)),
  to = rep(colnames(mat), each = nrow(mat)),
  value = as.vector(mat),
  stringsAsFactors = FALSE
)

# Chord diagram drawn from the data frame.
chordDiagram(df, grid.col = brewer.pal(9, "Set1"), link.border = "grey")
circos.clear()

# Chord diagram drawn from the matrix.
chordDiagram(mat, grid.col = brewer.pal(9, "Set1"), link.border = "grey")

circos.clear()
  • Sankey diagram
library(ggalluvial) # alluvial
library(ggplot2)
data(vaccinations)

# Reverse the stacking order of the responses. Rebuilding the factor reorders
# the levels while every observation keeps its own label; assigning
# rev(levels(...)) through `levels<-` would relabel the observations instead.
vaccinations$response <- factor(
  vaccinations$response,
  levels = rev(levels(vaccinations$response))
)

# Alluvial (Sankey-style) plot of survey responses over the three surveys.
ggplot(vaccinations,
       aes(x = survey,
           stratum = response,
           alluvium = subject,
           y = freq, # flow magnitude; `y` replaces the deprecated `weight`
           fill = response,
           label = response)) +
  geom_flow(alpha = 0.7,
            color = "darkgray") +
  geom_stratum(alpha = 1) +
  geom_text(stat = "stratum",
            size = 3.5) +
  theme_classic() + # optionally add coord_flip() here
  theme(legend.position = "none",
        axis.text.x = element_text(color = "black",
                                   size = 12),
        axis.title.x = element_blank(),
        axis.text.y = element_blank(),
        axis.line = element_blank(),
        axis.ticks = element_blank()) +
  ggtitle("Vaccination Survey responses at three points in time")

The Sankey diagram for the lj dataset.

library(ggalluvial)
# Restore the saved workspace; presumably it provides the `lj` data frame
# used below (verify against the .RData file).
load("/Users/jameschen/Documents/02_Teaching/06_r4ds/slides/data/lj_sh_2019.RData")
library(showtext)
showtext_auto()

# Alluvial plot across four categorical axes, weighted by the number of rows
# in each combination; flows are coloured by `line`.
lj %>%
  count(line, directions1, decoration, hml) %>%
  ggplot(aes(axis1 = line,
             axis2 = directions1,
             axis3 = decoration,
             axis4 = hml,
             y = n)) +
  ggtitle("Alluvial plot of Lianjia Secondhand houses 2019") +
  scale_x_discrete(limits = c("line", "directions1", "decoration", "hml")) +
  geom_flow(aes(fill = as.factor(line)), alpha = 0.7, colour = "darkgray") +
  geom_stratum(alpha = 1) +
  geom_text(aes(label = after_stat(stratum)), stat = "stratum", size = 2)

# Separate statement, not part of the plot above: it replaces the session's
# default theme, so it can only affect plots drawn afterwards.
theme_set(theme(text = element_text(family = "Songti SC", size = 12, face = "bold")))

Notes:

  • Chinese characters: in this example, setting the font with theme_set does not take effect; use showtext_auto() from the showtext package instead.

  • colors in the flow: use fill to define the color of flow.

  • Compare the two Sankey diagrams: in the first, the strata are the same at every axis, so we can define the color with fill = response; in the second, the strata differ across axes, but we can still specify a color through the fill aesthetic in the geom_flow function.